##### ########################################################## ######
#####                                                            ######
#####   Input: debates/sentence-level style scores               ######
#####   Output: mp-by-debate style scores                         ######
#####                                                            ######
##### ########################################################## ######

# Remove every visible (non dot-prefixed) object from the current environment so the
# script starts from an empty workspace.
# NOTE(review): if this file is source()d from an interactive session it also deletes the
# caller's objects — confirm that is intended.
rm(list=ls())
# Rescale a numeric vector linearly onto [0, 1]: the minimum maps to 0, the maximum to 1.
#   x      numeric vector
#   na.rm  if TRUE, missing values are ignored when finding the range and stay NA in the
#          result; the default FALSE keeps the previous behaviour, where any NA in `x`
#          makes the whole result NA.
# A constant vector has zero range, so every element becomes NaN (0 / 0).
range01 <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  (x - lo) / (hi - lo)
}

# Load libraries

library(quanteda) # v3.0.0
library(data.table) # v1.13.6
library(plyr) # v1.8.6
library(texreg) # v1.37.5
library(ggplot2) # v3.3.3
library(plm) # v2.2-3
library(corrplot) # v0.84

# Load data
#
# Each .Rdata file restores its saved objects into the workspace; the rest of the script
# relies on `debates`, `dictionary_scores`, `complexity_scores` and `repetition_scores`
# being among them.

# Raw debate information
load(file = "data/debates.Rdata")
# Sentence-level dictionary scores
load(file = "working/dictionaries_sentence.Rdata")
# Speech-level complexity scores
load(file = "working/complexity_speech.Rdata")
# Mp-debate level repetition scores
load(file = "working/repetition_mp_debate.Rdata")

# Education and occupation of each MP
education_occupation <- read.csv(file = "data/mp_data/final_occupation_education_v2.csv")

## Convert all data to individual-by-debate

# Lookup table: for every speech (epobject_id), the distinct speaker id(s) and debate
# section id(s) recorded for it in `debates`.
ep_id_to_person_debate_ids <- debates[
  ,
  list(
    person_id = unique(person_id),
    section_id = unique(section_id)
  ),
  by = "epobject_id"
]

# Attach speaker and debate ids to the sentence-level and speech-level score tables.
dictionary_scores <- merge(
  x = dictionary_scores,
  y = ep_id_to_person_debate_ids,
  by = "epobject_id"
)
complexity_scores <- merge(
  x = complexity_scores,
  y = ep_id_to_person_debate_ids,
  by = "epobject_id"
)

# Aggregate sentence-level dictionary scores to mp-debate level
#
# glove_* scores: per-word score of each sentence, averaged with the sentence word count
# as weight. dic_* scores: total dictionary count divided by total word count.

speech_scores <- dictionary_scores[
  ,
  list(
    affect         = weighted.mean(x = glove_affect / n_words, w = n_words),
    posemo         = weighted.mean(x = glove_posemo / n_words, w = n_words),
    negemo         = weighted.mean(x = glove_negemo / n_words, w = n_words),
    fact           = weighted.mean(x = glove_fact / n_words, w = n_words),
    anecdote       = weighted.mean(x = glove_anecdote / n_words, w = n_words),
    aggression     = weighted.mean(x = glove_aggression / n_words, w = n_words),
    dic_affect     = sum(dic_affect) / sum(n_words),
    dic_posemo     = sum(dic_posemo) / sum(n_words),
    dic_negemo     = sum(dic_negemo) / sum(n_words),
    dic_fact       = sum(dic_fact) / sum(n_words),
    dic_anecdote   = sum(dic_anecdote) / sum(n_words),
    dic_aggression = sum(dic_aggression) / sum(n_words)
  ),
  by = c("person_id", "section_id")
]

# Correct complexity polarity (flip the sign of the stored score)

complexity_scores[, complexity := complexity * -1]

# Aggregate speech-level complexity scores to the mp-debate level

# Bring in each speech's word count, which serves as the averaging weight below.
complexity_scores <- merge(
  complexity_scores,
  debates[, c("epobject_id", "n_words")]
)

# Word-count-weighted mean complexity per MP and debate; speeches with a missing
# complexity score are left out of the mean.
complexity_scores <- complexity_scores[
  ,
  list(complexity = weighted.mean(complexity, n_words, na.rm = TRUE)),
  by = c("person_id", "section_id")
]

## Merge all scores

# Full outer join of the dictionary, complexity and repetition tables on MP and debate,
# so an mp-debate pair present in any one of them is kept.
speech_scores <- Reduce(
  function(left, right) {
    merge(left, right, by = c("person_id", "section_id"), all = TRUE)
  },
  list(speech_scores, complexity_scores, repetition_scores)
)

## Add debate meta data

# Tenure, expressed in years of 365 days
debates[, years_in_parliament := days_in_house / 365]

# Joiners: at most 365 days in the House at the time of the speech
debates[, joiners := days_in_house <= 365]

# Leavers: the speech falls within 365 days before the MP's `left_house` date
debates[, diff_in_days := difftime(left_house, hdate, units = "days")]
debates[, leavers := diff_in_days <= 365]

# Government / opposition post-holders who do not attend the (shadow) cabinet
debates[, is_gov_minister := holds_government_position & !attends_cabinet]
debates[, is_opp_minister := holds_opposition_position & !attends_shadow_cabinet]

## Debate types

# TRUE for each row of `debates` whose `parent` heading matches any of `terms`; the terms
# are joined with "|" into a single case-insensitive regular expression.
parent_matches <- function(terms) {
  grepl(paste(terms, collapse = "|"), debates$parent, ignore.case = TRUE)
}

# QT
debates$question_time <- parent_matches("Oral answers to question")

# PMQs (exact, case-sensitive comparison against the full heading)
debates$pm_question_time <- debates$parent == "Oral Answers to Questions:  Prime Minister"

# Opposition Day & Backbench Business
opp_bbb <- c("Opposition Day", "Opposition Day:", "Backbench Business", "Backbench Business:",
             "Adjournment", "Private Members' Bills", "Remaining Private Members' Bills")
debates$backbench_opposition_day <- parent_matches(opp_bbb)

# Procedural Debates
procedure <- c("Orders of the Day", "Orders of the Day:", "Business of the House",
               "House Procedures", "Point of Order", "Business without Debate",
               "Speaker's Statement", "New Member", "Privilege", "Estimates", "Estimates Day")
debates$procedure <- parent_matches(procedure)

# Legislation
debates$legislation <- parent_matches("Bill")

# Petitions
petitions <- c("Petitions", "Petitions:")
debates$petitions <- parent_matches(petitions)

# Single categorical debate type. Flags are applied in increasing priority: a later entry
# overwrites an earlier one when several flags are TRUE for the same row. Rows where a
# flag is NA are left untouched by that flag. The petitions flag is not part of the typing.
type_by_flag <- c(question_time = "Questions",
                  pm_question_time = "PMQs",
                  legislation = "Legislation",
                  backbench_opposition_day = "Opposition/Backbench",
                  procedure = "Procedure")

type_labels <- rep("Other", nrow(debates))
for (flag_name in names(type_by_flag)) {
  type_labels[which(debates[[flag_name]])] <- type_by_flag[[flag_name]]
}

debates$debate_type <- factor(
  type_labels,
  levels = c("Other", "Questions", "PMQs", "Procedure", "Opposition/Backbench", "Legislation")
)

# Proportion of women: share of rows in each debate section whose `gender` is "Female"
debates[, prop_women := sum(gender == "Female") / .N, by = "section_id"]

## Add female leader variable

# TRUE for a debate section when at least one row from a cabinet or shadow-cabinet
# attendee has `gender` "Female".
debates[
  ,
  female_leader_present := "Female" %in% gender[attends_cabinet | attends_shadow_cabinet],
  by = "section_id"
]

## Add cohort

# Entry date(s) into the House for each MP, taken from `house_start`.
cohort <- debates[, list(hdate = unique(house_start)), by = person_id]

# Boundary dates between cohorts (UK general-election dates) and the cohort labels.
# An MP belongs to the cohort whose opening date is the latest one on or before their
# entry date; anyone who entered before the first boundary falls in "1974-1979".
cohort_starts <- as.Date(c("1979-05-03", "1983-06-09", "1987-06-11", "1992-04-09",
                           "1997-05-01", "2001-06-07", "2005-05-05", "2010-05-06",
                           "2015-05-07", "2017-06-08"))
cohort_levels <- c("1974-1979", "1979-1983", "1983-1987", "1987-1992", "1992-1997",
                   "1997-2001", "2001-2005", "2005-2010", "2010-2015", "2015-2017",
                   "2017-2019")

# findInterval() returns 0 for dates before the first boundary and k for dates in
# [boundary k, boundary k + 1), i.e. the same left-closed intervals as
# `hdate >= lower & hdate < upper`.
cohort_index <- findInterval(as.numeric(as.Date(cohort$hdate)), as.numeric(cohort_starts))

# Rows with a missing entry date keep the "2017-2019" default, as before.
cohort$cohort <- "2017-2019"
has_entry_date <- !is.na(cohort_index)
cohort$cohort[has_entry_date] <- cohort_levels[cohort_index[has_entry_date] + 1L]

# The pre-1979 assignment used to target a `parliamentary_term` column that does not exist
# in this table, so those MPs never received the "1974-1979" label; the label was also
# missing from the factor levels and would have become NA.
# NOTE(review): "1974-1979" is now the first (reference) level — confirm that downstream
# models relying on the default reference category expect this.
cohort$cohort <- factor(cohort$cohort, levels = cohort_levels)

# Select by name rather than by position so extra columns cannot shift the selection.
debates <- merge(debates, cohort[, c("person_id", "cohort")], by = "person_id", all = TRUE)

## Add education and occupation

# 3 MPs missing, imputed as graduates
education_occupation$has_degree <- replace(
  education_occupation$has_degree,
  is.na(education_occupation$has_degree),
  TRUE
)

# Full outer join on person_id, leaving out the `name` column of the education file.
debates <- merge(
  debates,
  education_occupation[, setdiff(names(education_occupation), "name")],
  by = "person_id",
  all = TRUE
)

## Convert to MP by debate

# Columns carried over to the MP-by-debate table. For every (person_id, section_id) pair
# the distinct values of each column are taken.
mp_debate_cols <- c(
  "name", "parent", "party_short",
  "is_gov_minister", "is_opp_minister",
  "attends_cabinet", "attends_shadow_cabinet",
  "years_in_parliament", "age_years",
  "is_committee_chair", "is_speaker",
  "gender", "cohort",
  "question_time", "female_leader_present", "pm_question_time",
  "year", "session", "yearmon", "hdate", "parliamentary_term",
  "procedure", "legislation", "backbench_opposition_day",
  "joiners", "leavers", "debate_type", "petitions", "margin",
  "highest_education", "has_degree", "occupation_class", "occupation_new"
)

mp_by_debate <- debates[
  ,
  lapply(.SD, unique),
  by = list(person_id, section_id),
  .SDcols = mp_debate_cols
]

# Three columns are exposed under shorter names in the output table.
setnames(
  mp_by_debate,
  old = c("party_short", "backbench_opposition_day", "occupation_new"),
  new = c("party", "backbench_opposition", "occupation")
)

# Keep only mp-debate pairs present in both the scores and the metadata.
speech_scores <- merge(
  x = speech_scores,
  y = mp_by_debate,
  by = c("person_id", "section_id")
)

## Missingness in the dictionary variables comes entirely from short speeches excluded in apply_dictionaries

# Drop mp-debate observations with no dictionary score, identified by a missing `affect`
# (the original note describes these as observations with fewer than 50 words).

speech_scores <- speech_scores[!is.na(affect)]

# Standardize all outcome variables to mean 0 and standard deviation 1

# Z-score a numeric vector: subtract the mean and divide by the sample standard deviation.
# Missing values are ignored when computing both and stay NA in the result.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
stdize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
# Main outcome measures: each gets a z-scored companion column named <outcome>_std
main_outcomes <- c("affect", "posemo", "negemo", "fact", "anecdote", "aggression",
                   "complexity", "repetition")
main_std_cols <- paste0(main_outcomes, "_std")
speech_scores[, (main_std_cols) := lapply(.SD, stdize), .SDcols = main_outcomes]

# Dictionary-count variants: dic_<outcome> is z-scored into <outcome>_std_dic
dic_outcomes <- c("affect", "posemo", "negemo", "fact", "anecdote", "aggression")
dic_std_cols <- paste0(dic_outcomes, "_std_dic")
speech_scores[, (dic_std_cols) := lapply(.SD, stdize), .SDcols = paste0("dic_", dic_outcomes)]

# Keep only observations with a standardized complexity score.
# NOTE(review): this filter runs after the z-scores are computed, so if it removes rows the
# saved *_std columns need not have exactly mean 0 and sd 1 — confirm that is intended.
speech_scores <- speech_scores[!is.na(complexity_std)]

save(speech_scores, file = "working/speech_scores.Rdata")
